// vim: set syntax=asm :

// Scalar-operand element-wise ops. Each include below instantiates
// fma_mmm_ymm_scalar.tmpliq once, passing a label name, the AVX
// packed-single instruction to use, the register range (from..to) and the
// element type. The template body lives in that file and is not shown here.
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_min", op:"vminps", from:from, to:to, type:type%}
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_max", op:"vmaxps", from:from, to:to, type:type%}
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_add", op:"vaddps", from:from, to:to, type:type%}
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_mul", op:"vmulps", from:from, to:to, type:type%}
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub", op:"vsubps", from:from, to:to, type:type%}
// Same instruction as scalar_sub, but with `flipped: true`.
// NOTE(review): presumably this swaps the operand order of the subtraction;
// confirm against fma_mmm_ymm_scalar.tmpliq.
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsubps", from:from, to:to, flipped: true, type:type%}

{{L}}leaky_relu:
    // Leaky ReLU applied in place to every register of the (from..to) range:
    //     x <- x           where 0 < x
    //     x <- alpha * x   everywhere else
    // alpha is read from [rdi + 8].
    //
    // can only use ymm12 to ymm15 as scratch:
    //     ymm12 = alpha * x, ymm13 = compare mask, ymm14 = 0.0, ymm15 = alpha

    // ymm15 <- alpha, broadcast to all eight f32 lanes
    {% if type == "f32" %}
        vbroadcastss    ymm15, dword ptr [rdi + 8]
    {% else %}
        // Any other type: alpha is loaded as a 16-bit half float and widened
        // to f32. Use the VEX-encoded vpinsrw rather than legacy SSE pinsrw:
        // the surrounding code is VEX-encoded and the ymm registers are live,
        // so a non-VEX instruction here would mix SSE and AVX encodings.
        // Only lane 0 matters; the other lanes are overwritten by the
        // broadcast below.
        vpinsrw         xmm15, xmm15, word ptr [rdi + 8], 0
        vcvtph2ps       ymm15, xmm15
        vbroadcastss    ymm15, xmm15
    {% endif %}

    // ymm14 <- all zero
    vpxor           ymm14, ymm14, ymm14

    {% for reg in (from..to) %}
        // ymm12 <- alpha * x
        vmulps      ymm12, ymm{{reg}}, ymm15
        // ymm13 <- per-lane mask, all ones where 0 < x
        vcmpps      ymm13, ymm14, ymm{{reg}}, 1 // 1 means LT
        // select the original x where the mask is set, the multiplied value otherwise
        vblendvps   ymm{{reg}}, ymm12, ymm{{reg}}, ymm13
    {% endfor %}

    jmp    {{L}}non_linear_loop

// q_scale, q_shl and q_shr share a single entry point: all three labels sit
// on the same jump to the `unsupported` label, which is defined elsewhere.
// NOTE(review): presumably these are quantization ops that this kernel does
// not implement; confirm against the dispatcher.
{{L}}q_scale:
{{L}}q_shl:
{{L}}q_shr:
    jmp {{L}}unsupported

